# get_population.R
#
# creates a gams input file containing the household count in each income class
# by state using the cps. gams input file is stored in
# build/data/population_data.gms
#



# ------------------------------------------------------------------------------
# load libraries
# ------------------------------------------------------------------------------

# if the lodown package (used to download and load the cps data) is not
# installed run the following three commands
#install.packages( "devtools" , repos = "http://cran.rstudio.com/" )
#library(devtools)
#install_github( "ajdamico/lodown" , dependencies = TRUE )

library(lodown)
library(tidyr)

# load in the R utilities for sage
source(file.path("utilities","R_utilities.R"))



# ------------------------------------------------------------------------------
# set general parameters
# ------------------------------------------------------------------------------

# year of the benchmark
year = 2016

# directory where data files will be stored
data.dir = file.path("build","data")

# scaling factor -> households to million households
scale = 1e-6

# upper bounds of the income bins in implan along with the set indexes
hh.inc = data.frame(label=c("hhl","hh15","hh30","hh40","hh50", "hh70","hh100","hh150","hh200"),
                    value=c(15000, 30000, 40000, 50000, 70000, 100000,150000, 200000,    Inf))



# ------------------------------------------------------------------------------
# load the cps asec data
# ------------------------------------------------------------------------------

# download and process the cps march supplement for the year after the benchmark
# year since income questions on the supplement are for the previous year
cpsasec_cat = get_catalog("cpsasec",output_dir=data.dir)
cpsasec_cat = cpsasec_cat[cpsasec_cat$year==(year+1),]
cpsasec_cat = lodown("cpsasec",cpsasec_cat)
cpsasec = readRDS(file.path(data.dir,paste0(year+1," cps asec.rds")))

# add the state abbreviations to the data using the fips codes
cpsasec$state = factor(cpsasec$gestfips,levels = c(  1L,  2L,  4L,  5L,  6L,
                                                     8L,  9L, 10L, 11L, 12L,
                                                     13L, 15L, 16L, 17L, 18L,
                                                     19L, 20L, 21L, 22L, 23L,
                                                     24L, 25L, 26L, 27L, 28L,
                                                     29L, 30L, 31L, 32L, 33L,
                                                     34L, 35L, 36L, 37L, 38L,
                                                     39L, 40L, 41L, 42L, 44L,
                                                     45L, 46L, 47L, 48L, 49L,
                                                     50L, 51L, 53L, 54L, 55L,
                                                     56L),
                                        labels = c("al","ak","az","ar","ca",
                                                   "co","ct","de","dc","fl",
                                                   "ga","hi","id","il","in",
                                                   "ia","ks","ky","la","me",
                                                   "md","ma","mi","mn","ms",
                                                   "mo","mt","ne","nv","nh",
                                                   "nj","nm","ny","nc","nd",
                                                   "oh","ok","or","pa","ri",
                                                   "sc","sd","tn","tx","ut",
                                                   "vt","va","wa","wv","wi",
                                                   "wy"))

# extract the household file with representative persons
cpsasec = cpsasec[cpsasec$a_exprrp %in% c(1,2) & cpsasec$h_hhtype==1,]

# remove the cps data file
file.remove(file.path(data.dir,paste0(year+1," cps asec.rds")))



# ------------------------------------------------------------------------------
# write the benchmark population input file
# ------------------------------------------------------------------------------

# get the estimated number of households in each income bin by state
hh.pop = data.frame(r=unique(cpsasec$state))
for (i in 1:nrow(hh.pop)) {
  for (j in 1:nrow(hh.inc)) {
    if (j==1)
      bottom = -Inf
    else
      bottom = hh.inc$value[j-1]
    top = hh.inc$value[j]
    hh.pop[i,as.character(hh.inc$label[j])] = sum(cpsasec$marsupwt[cpsasec$htotval>bottom &
                                                     cpsasec$htotval<=top &
                                                     cpsasec$state==hh.pop$r[i]])
  }
}

# scale counts
hh.pop[,-1] = scale*hh.pop[,-1]

# write a gams input file with the population by income and state
write.gams.variable(file.path("build","data","population_data.gms"),"n0",
                    gather_(hh.pop,"h","value",names(hh.pop)[-1]),
                    comments="benchmark - households by state and income")
